---
title: "R Notebook"
output: html_notebook
---


## Topic model

For STM the number of observations in content covariate (7897), prevalence covariate (5768), and documents (7897) are not all equal. So removing missings from prevalence covariates and corresponding documents with missing covariates.



```{r, message=FALSE}
library(readtext)
library(quanteda)
library(dplyr)
library(ggplot2)
library(stm)
library(tidytext)
library(haven)
library(data.table)
```


UNGD data are available on the Harvard Dataverse at https://doi.org/10.7910/DVN/0TJX8Y


```{r}


DATA_DIR <- "~/Dropbox/Research/UNGDC projects/UN Data/" 

ungd_files <- readtext(paste0(DATA_DIR, "TXT/*"), 
                                 docvarsfrom = "filenames", 
                                 dvsep="_", 
                                 docvarnames = c("Country", "Session", "Year"))

```


```{r}

covariates <- read_dta("../MasterDS_EU_Feb2018.dta")

covariates$Year <- as.integer(covariates$year)

full_files <- left_join(ungd_files, covariates, by = c("Country"="iso", "Year"))

#Keeping only complete cases for STM model
nn <- as.data.table(full_files)

complete_eu <- na.omit(nn, cols = "eu_total")


corpus <- corpus(complete_eu, text_field = "text") 

```


```{r}
#Tokenization and basic pre-processing
tok.sm <- tokens(corpus, what = "word",
              remove_numbers = TRUE, 
              remove_punct = TRUE,
              remove_symbols = TRUE,
              remove_twitter = TRUE,
              remove_url = TRUE,
              verbose = TRUE)
```

```{r}
#DFM creation from tokens, removing stopwords.
dfm <- dfm(tok.sm, 
           tolower = TRUE,
           remove=stopwords("english"),
           verbose = TRUE)

```



```{r}
dfm.m <- dfm_select(dfm, c("[\\d-]", "[[:punct:]]", "^.{1}$"), 
                           selection = "remove", 
                    valuetype="regex", verbose = TRUE)

```

```{r}
head(featnames(dfm.m),50)
tail(featnames(dfm.m),50)
```



```{r}
stm.dfm <- convert(dfm.m, to = "stm",  docvars = docvars(corpus))
```


```{r eval=FALSE}
search <- searchK(stm.dfm$documents, stm.dfm$vocab, 
                  K = c(3:50), 
                  prevalence = ~ factor(Country) + s(Year) + factor(eu_total),
                  data = stm.dfm$meta)

search.results <- as.data.frame(search$results)
readr::write_csv(search.results, "search.results.csv")
```



```{r eval=FALSE, include=FALSE}
search_results <- read_csv("search.results.csv")
```


#################
##### Figure 11 in supplementary materials
#################


```{r eval=FALSE}
ggplot(search_results, aes(x=semcoh, y=exclus)) +
    geom_point(size=5, shape =1, color = "green") +
  geom_text(aes(label=K), size=2) +
   geom_smooth(method="lm", se = FALSE, color = "red", size = .3) +
  geom_vline(xintercept = mean(search_results$semcoh), size = .2, linetype="dashed") +
    geom_hline(yintercept = mean(search_results$exclus), size = .2, linetype="dashed") +
  theme_bw() +
 # ggtitle("Selecting optimal number of topics") + 
  xlab("Semantic coherence") + ylab("Exclusivity")

ggsave("optimal_topics.pdf")

```




```{r}
topics14 <- stm(stm.dfm$documents, stm.dfm$vocab,  
             prevalence = ~ factor(Country) + s(Year) + factor(eu_total), 
             data = stm.dfm$meta, 
             K = 14, init.type = "Spectral")
```


```{r}
words <- labelTopics(topics14, n = 15)
prob <- as.data.frame(words[1])
frex <- as.data.frame(words[2])
```

```{r}
labelTopics(topics14,n = 5)

```


#################
##### Figure 12 in supplementary materials
#################


```{r}
pdf("topic14_prob_words.pdf", width = 10, height = 7)

plot(topics14,type="summary", labeltype = "prob", 
     xlim = c(0, 1.5), 
     n = 25, 
     text.cex = .4, 
     main = "Top 25 highest prob words")

dev.off()
```


```{r}
pdf("topic14_frex_words.pdf", width = 10, height = 7)

plot(topics14,type="summary", labeltype = "frex", 
     xlim = c(0, 1.5), 
     n = 25, 
     text.cex = .35, 
     main = "Top 25 FREX words")

dev.off()
```




Topic 1: Disarmament; Topic 2: African peace and security; Topic 3: The United Nations, Topic 4: Colonialism and independence; Topic 5: International security; Topic 6: War and peace; Topic 7: Middle East peace; Topic 8: Small Island Developing States (SIDS); Topic 9: Economic development and the United Nations; Topic 10: Africa region; Topic 11: Latin America region; Topic 12: International development and the Global South; Topic 13: Europe region; Topic 14: Sustainable development and climate change

```{r}
topiclabels <- c("Disarmament", "African peace and security", "Pan-Asian cooperation", "Colonialism and independence", "International security", "Conflict and terrorism", "Middle East peace", "Small Island Developing States (SIDS)", "Economic development and the United Nations", "Africa region", "Latin America region", "International development and the Global South", "European region", "Sustainable development and climate change")
```




```{r}
doc.names <- tidy(topics14, matrix = "gamma", document_names = names(stm.dfm$documents))
colnames(doc.names)[1] <- "country" 
doc.names
```


```{r}
documents <- tidy(topics14, matrix = "gamma", document_names = stm.dfm$meta$eu_total)
topics <- cbind(documents, doc.names)
topics[5:6] <- NULL
topics
```



```{r}
gamma <- stringr::str_replace(topics$country, ".txt", "") %>% 
  stringr::str_split(., "_", simplify = TRUE) %>% 
  cbind(., topics)

gamma$`1` <- as.character(gamma$`1`)
gamma$`2` <- as.numeric(as.character(gamma$`2`))
gamma$`3` <- as.numeric(as.character(gamma$`3`))

colnames(gamma)[1] <- "Country"
colnames(gamma)[2] <- "Session"
colnames(gamma)[3] <- "Year"
colnames(gamma)[4] <- "membership"

gamma$membership[gamma$Country=="USA"] <- 4
gamma$membership[gamma$Country=="RUS"] <- 5

```

```{r}
sums <- gamma %>%  group_by(membership,topic) %>% summarise(Topic_Proportion = mean(gamma)) %>% arrange(topic, Topic_Proportion) %>% filter(membership !=0)

topic_distributions <- as.data.frame(sums)
readr::write_csv(topic_distributions, "topic_distributions.csv")

knitr::kable(sums)
```

```{r}
topic_names <- as_labeller(c(
                    `1` = "Disarmament",
                    `2` = "African peace and security",
                    `3` = "Pan-Asian cooperation",
                    `4` = "Colonialism and independence", 
                    `5` = "International security",
                    `6` = "Conflict and terrorism",
                    `7` = "Middle East peace",
                    `8` = "Small Island Developing States (SIDS)",
                    `9` = "Economic development and the United Nations",
                    `10` = "Africa region",
                    `11` = "Latin America region",
                    `12` = "International development and the Global South",
                    `13` = "European region",
                    `14` = "Sustainable development and climate change"
                    ))
```


#################
##### Figure 13 in supplementary materials
#################



```{r}
ggplot(sums, aes(x = reorder(membership, Topic_Proportion), 
                 y = Topic_Proportion, fill = factor(topic))) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free", ncol=2, labeller = topic_names) +
    coord_flip() + labs(x="", y="") +
   scale_x_discrete(labels=c( "1" = "Applicant", "2" = "Candidate",
                              "3" = "Member", "4" = "USA", "5" = "RUS"), 
                    limits = c(1,2,3,4,5))

ggsave("topic_usage.pdf", width = 8, height = 14)
```





```{r}
EU6 <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD")
EU9 <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD", "DNK", "IRL", "GBR")
EU12 <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD", "DNK", "IRL", "GBR", "GRC","ESP", "PRT")
EU15 <- c("BEL", "FRA", "DEU", "ITA", "LUX", "NLD", "DNK", "IRL", "GBR", "GRC","ESP", "PRT","AUT", "FIN", "SWE")
wave1 <- c("DNK", "IRL", "GBR")
wave2 <- "GRC" 
wave3 <- c("ESP", "PRT") 
wave4 <- c("AUT", "FIN", "SWE") 
wave5 <- c("CZE", "HUN", "POL", "EST", "LVA", "LTU", "CYP", "MLT", "SVK", "SVN")
wave6 <- c("BGR", "ROU") 
wave7 <- "HRV" 
```



For a group of EU related topics

```{r}
topics_eu <- c("2","9", "10", "11", "14")
gamma$topics_eu <- gamma$topic %in% topics_eu

topic_sums <- gamma %>% group_by(topics_eu, Country, Year) %>% summarise(sum_eu_topics = sum(gamma)) %>% filter(topics_eu==TRUE, Year>1990) %>% arrange(Country, Year, sum_eu_topics) 

topic_sums$eu10 <- topic_sums$Country %in% wave5
topic_sums$eu6 <- topic_sums$Country %in% EU6
topic_sums$eu9 <- topic_sums$Country %in% EU9
topic_sums$eu12 <- topic_sums$Country %in% EU12
topic_sums$eu15 <- topic_sums$Country %in% EU15
topic_sums$usa <- topic_sums$Country=="USA"
topic_sums$rus <- topic_sums$Country=="RUS"

eu10_mean_topic <- topic_sums %>% group_by(eu10, Year) %>% summarise(eu10_tp = mean(sum_eu_topics)) %>% filter(eu10==TRUE) 

eu6_mean_topic <- topic_sums %>% group_by(eu6, Year) %>% summarise(eu6_tp = mean(sum_eu_topics)) %>% filter(eu6==TRUE) 

eu9_mean_topic <- topic_sums %>% group_by(eu9, Year) %>% summarise(eu9_tp = mean(sum_eu_topics)) %>% filter(eu9==TRUE) 

eu12_mean_topic <- topic_sums %>% group_by(eu12, Year) %>% summarise(eu12_tp = mean(sum_eu_topics)) %>% filter(eu12==TRUE) 

eu15_mean_topic <- topic_sums %>% group_by(eu15, Year) %>% summarise(eu15_tp = mean(sum_eu_topics)) %>% filter(eu15==TRUE)

usa_mean_topic <- topic_sums %>% group_by(usa, Year) %>% summarise(usa_tp = mean(sum_eu_topics)) %>% filter(usa==TRUE)

rus_mean_topic <- topic_sums %>% group_by(rus, Year) %>% summarise(rus_tp = mean(sum_eu_topics)) %>% filter(rus==TRUE)

mean_topics <- bind_cols(eu10_mean_topic, eu6_mean_topic, eu9_mean_topic, eu12_mean_topic, eu15_mean_topic, usa_mean_topic, rus_mean_topic)

```






#################
##### Figure 6 (and figure 19 in supplementary materials)
#################


```{r}
ggplot(mean_topics, aes(x=Year)) +
  geom_area(aes(y=eu10_tp), fill = "red", alpha=0.3) +
  ylab("EU Topics Proportion") +
  scale_x_continuous(breaks=c(1990,2000, 2010)) +
#  ggtitle("Total EU topic proportions average per year per country group: EU10") +
  theme_bw()

ggsave("eu10_topic_proportions.pdf")
```



```{r}
ggplot(mean_topics, aes(x=Year)) +
  geom_area(aes(y=eu10_tp), fill = "red", alpha=0.3) +
  geom_area(aes(y=eu6_tp), fill = "blue", alpha=0.3) +
  geom_line(aes(y=usa_tp), colour = "black") +
  geom_line(aes(y=rus_tp), colour = "black", linetype = "dashed") +
  ylab("EU Topics Proportion") +
  scale_x_continuous(breaks=c(1990,2000, 2010)) +
#  ggtitle("Total EU topic proportions average per year per country group: EU10 (red) vs EU6") +
  theme_bw()

ggsave("eu10_eu6_topic_proportions_rus_usa.pdf")
```






#################
##### Figure 18 in supplementary materials
#################




```{r}
ggplot(mean_topics, aes(x=Year)) +
  geom_area(aes(y=eu10_tp), fill = "red", alpha=0.3) +
  geom_area(aes(y=eu6_tp), fill = "blue", alpha=0.3) +
  ylab("EU Topics Proportion") +
    scale_x_continuous(breaks=c(1990,2000, 2010)) +
#  ggtitle("Total EU topic proportions average per year per country group: EU10 (red) vs EU6") +
  theme_bw()

ggsave("eu10_eu6_topic_proportions.pdf")
```



